# Knitr chunk options: cache expensive computations and hide warnings/messages
# in the rendered document.
knitr::opts_chunk$set(cache = TRUE, warning = FALSE, message = FALSE)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.3     ✓ purrr   0.3.4
## ✓ tibble  3.0.5     ✓ dplyr   1.0.3
## ✓ tidyr   1.1.2     ✓ stringr 1.4.0
## ✓ readr   1.4.0     ✓ forcats 0.5.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
# Weekly NFL game attendance and per-season team standings from the
# 2020-02-04 TidyTuesday release (read straight from GitHub; needs network).
attendance <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-02-04/attendance.csv")
standings <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-02-04/standings.csv")

# Attach each weekly attendance row to that team-season's standings,
# keyed on season year plus the two team-name columns.
attendance_joined <- left_join(
  attendance,
  standings,
  by = c("year", "team_name", "team")
)

attendance_joined

Explore data

# Weekly attendance distribution per team (bye weeks dropped), with teams
# ordered by their median attendance; fill marks playoff vs. non-playoff
# team-seasons.
attendance_joined %>%
  filter(!is.na(weekly_attendance)) %>%
  mutate(team_name = fct_reorder(team_name, weekly_attendance)) %>%
  ggplot(aes(x = team_name, y = weekly_attendance, fill = playoffs)) +
  geom_boxplot(outlier.alpha = 0.5) +
  coord_flip() +
  labs(x = NULL, y = "Weekly NFL game attendance", fill = NULL)

# One row per team-season: compare margin-of-victory distributions for
# playoff vs. non-playoff teams (overlaid, semi-transparent histograms).
attendance_joined %>%
  distinct(team_name, year, margin_of_victory, playoffs) %>%
  ggplot(aes(x = margin_of_victory, fill = playoffs)) +
  geom_histogram(alpha = 0.7, position = "identity") +
  labs(y = "Number of teams", x = "Margin of victory")

# Attendance across the season; week is coerced to a factor so every
# week gets its own box.
attendance_joined %>%
  mutate(week = factor(week)) %>%
  ggplot(aes(x = week, y = weekly_attendance, fill = week)) +
  geom_boxplot(outlier.alpha = 0.5, show.legend = FALSE) +
  labs(x = "Week of NFL season", y = "Weekly NFL game attendance")

# Modeling data: drop bye weeks (rows with missing attendance) and keep
# only the outcome plus the candidate predictors.
attendance_df <- attendance_joined %>%
  drop_na(weekly_attendance) %>%
  select(
    weekly_attendance,
    team_name, year, week,
    margin_of_victory, strength_of_schedule, playoffs
  )

attendance_df

Build simple models

library(tidymodels)

# Reproducible 75/25 train/test split, stratified on playoff status so
# both sets keep a similar playoff / non-playoff mix.
set.seed(1234)

attendance_split <- initial_split(attendance_df, strata = playoffs)

nfl_train <- training(attendance_split)
nfl_test <- testing(attendance_split)

# Ordinary least squares regression via the "lm" engine.
lm_spec <- linear_reg() %>%
  set_engine(engine = "lm")

lm_spec
## Linear Regression Model Specification (regression)
## 
## Computational engine: lm
# Fit the linear model on the training set, predicting weekly attendance
# from every other column (team, year, week, margin of victory, strength
# of schedule, playoff status). Team enters as dummy variables below.
lm_fit <- lm_spec %>%
  parsnip::fit(weekly_attendance ~ .,
    data = nfl_train
  )

lm_fit
## parsnip model object
## 
## Fit time:  64ms 
## 
## Call:
## stats::lm(formula = weekly_attendance ~ ., data = data)
## 
## Coefficients:
##          (Intercept)        team_nameBears      team_nameBengals  
##            -81107.86              -2879.80              -4875.47  
##       team_nameBills      team_nameBroncos       team_nameBrowns  
##              -361.08               2805.24               -324.11  
##  team_nameBuccaneers    team_nameCardinals     team_nameChargers  
##             -3063.65              -6139.80              -6489.31  
##      team_nameChiefs        team_nameColts      team_nameCowboys  
##              1974.33              -3392.79               6068.70  
##    team_nameDolphins       team_nameEagles      team_nameFalcons  
##               139.68               1259.16               -204.17  
##      team_nameGiants      team_nameJaguars         team_nameJets  
##              5447.10              -3095.46               4044.23  
##       team_nameLions      team_namePackers     team_namePanthers  
##             -3480.69               1114.11               1227.32  
##    team_namePatriots      team_nameRaiders         team_nameRams  
##              -214.20              -6324.74              -2884.85  
##      team_nameRavens     team_nameRedskins       team_nameSaints  
##              -398.90               6447.07                380.98  
##    team_nameSeahawks     team_nameSteelers       team_nameTexans  
##             -1405.89              -3567.81                264.07  
##      team_nameTitans      team_nameVikings                  year  
##             -1118.23              -3183.08                 74.73  
##                 week     margin_of_victory  strength_of_schedule  
##               -72.83                137.58                230.74  
##     playoffsPlayoffs  
##              -427.94
# Random forest regression via the ranger engine, default hyperparameters.
rf_spec <- set_engine(rand_forest(mode = "regression"), "ranger")

rf_spec
## Random Forest Model Specification (regression)
## 
## Computational engine: ranger
# Fit the random forest on the same training data and formula as the
# linear model, so the two fits are directly comparable.
rf_fit <- rf_spec %>%
  parsnip::fit(weekly_attendance ~ .,
    data = nfl_train
  )

rf_fit
## parsnip model object
## 
## Fit time:  6.3s 
## Ranger result
## 
## Call:
##  ranger::ranger(x = maybe_data_frame(x), y = y, num.threads = 1,      verbose = FALSE, seed = sample.int(10^5, 1)) 
## 
## Type:                             Regression 
## Number of trees:                  500 
## Sample size:                      7656 
## Number of independent variables:  6 
## Mtry:                             2 
## Target node size:                 5 
## Variable importance mode:         none 
## Splitrule:                        variance 
## OOB prediction error (MSE):       74737868 
## R squared (OOB):                  0.08217606

Evaluate models

In-sample prediction:

# Training-set predictions from both models, stacked long: one row per
# prediction, with the observed attendance as `truth` and a `model` label.
lm_train_preds <- lm_fit %>%
  predict(new_data = nfl_train) %>%
  mutate(truth = nfl_train$weekly_attendance, model = "lm")

rf_train_preds <- rf_fit %>%
  predict(new_data = nfl_train) %>%
  mutate(truth = nfl_train$weekly_attendance, model = "rf")

results_train <- bind_rows(lm_train_preds, rf_train_preds)

results_train

Out-of-sample prediction:

# Same stacking for the held-out test set.
lm_test_preds <- lm_fit %>%
  predict(new_data = nfl_test) %>%
  mutate(truth = nfl_test$weekly_attendance, model = "lm")

rf_test_preds <- rf_fit %>%
  predict(new_data = nfl_test) %>%
  mutate(truth = nfl_test$weekly_attendance, model = "rf")

results_test <- bind_rows(lm_test_preds, rf_test_preds)

results_test

In-sample results:

# In-sample RMSE for each model.
results_train %>%
  group_by(model) %>%
  rmse(truth, .pred)

RF >> LM

Out-of-sample results:

# Out-of-sample RMSE for each model.
results_test %>%
  group_by(model) %>%
  rmse(truth, .pred)

LM ~ RF

i.e. the RF has overfit the training data, but the LM has not.

Let’s visualise this:

# Predicted vs. observed attendance, faceted into testing and training
# panels; the dashed diagonal marks perfect prediction.
bind_rows(
  mutate(results_test, train = "testing"),
  mutate(results_train, train = "training")
) %>%
  ggplot(aes(truth, .pred, color = model)) +
  geom_abline(lty = 2, color = "gray80", size = 1.5) +
  geom_point(alpha = 0.5) +
  facet_wrap(~train) +
  labs(x = "Truth", y = "Predicted attendance", color = "Type of model")

RHS (training): the RF looks much better; LHS (testing): the two models are roughly equal.

Resampling

# 10-fold cross-validation on the training set, stratified on playoff
# status; seed reset so the folds are reproducible.
set.seed(1234)
nfl_folds <- vfold_cv(nfl_train, strata = playoffs)

# Refit the random forest on each fold, saving the held-out predictions
# so they can be plotted below. Positional args: spec, formula, resamples
# (NOTE(review): argument order matches this tune version — keep as-is).
rf_res <- fit_resamples(
  rf_spec,
  weekly_attendance ~ .,
  nfl_folds,
  control = control_resamples(save_pred = TRUE)
)

# Cross-validated performance metrics, averaged over the folds.
collect_metrics(rf_res)
rf_res
# Held-out predictions from every fold, colored by fold id; the dashed
# diagonal marks perfect prediction.
rf_res %>%
  unnest(.predictions) %>%
  ggplot(aes(x = weekly_attendance, y = .pred, color = id)) +
  geom_abline(lty = 2, color = "gray80", size = 1.5) +
  geom_point(alpha = 0.5) +
  labs(x = "Truth", y = "Predicted game attendance", color = NULL)